* Do file to import and clean census data (by postcode)

* March 2015 (code for 2011). Updated Dec 2018 to include 2006 and 2016 census data. 

clear

set more off


*1. import census csv files, change names and clean

global files Age Bedrooms DwellingStructure Education Employment Income Mortgage Rent Schooling Tenure
*
foreach xx of global files {
	clear
	insheet using OrigData/Census2011postcode/`xx'.csv, comma
	drop v1
	drop if _n<3

	drop if _n==2
	drop if _n> = 667
	replace v2 = "A" if _n==1
	export excel using OrigData/Census2011postcode/`xx'_postcode_clean.xls, replace
	
	clear
	
	import excel using OrigData/Census2011postcode/`xx'_postcode_clean.xls, firstrow
	missings dropvars, force

	label variable A "postcode" 
	
	foreach var of varlist _all   {
        local x : variable label `var'
        local new_name = strtoname(abbrev("`x'",32))
        rename `var' `new_name'
		}
		
	drop if postcode=="Total"
	replace postcode = substr(postcode,1,4)
	destring, replace
	label data `xx'
	save OrigData/Census2011postcode/`xx'_postcode.dta, replace
	
	clear
}




** some files downloaded cross border postcodes separately  - 2006
foreach xx in Age Bedrooms Education Employment Income Mortgage Rent Schooling Tenure {

clear	
	import excel using OrigData/Census2011postcode/`xx'_Crossing.xlsx
	
		drop A
		drop if _n<9
		drop if _n==2 
		drop if strpos(B,"crosses")>0 & strpos(B,"Vic") ==0
		drop if strpos(B,"Cells")>0
		drop if strpos(B,"Total")>0
		
export excel using OrigData/Census2011postcode/`xx'_Crossing_postcode_clean.xls, replace
	
		clear
	
import excel using OrigData/Census2011postcode/`xx'_Crossing_postcode_clean.xls, firstrow
		missings dropvars, force

		label variable A "postcode" 
		
		foreach var of varlist _all   {
			local x : variable label `var'
			local new_name = strtoname(abbrev("`x'",32))
			rename `var' `new_name'
			}
		
	drop if postcode=="Total"
	replace postcode = substr(postcode,1,4)
	destring, replace
	drop if postcode ==.
	
	append using OrigData/Census2011postcode/`xx'_postcode.dta
	save OrigData/Census2011postcode/`xx'_postcode.dta, replace
	}




*2. calculate average income for each postcode

* a. personal income using imputed values from ABS: http://www.abs.gov.au/websitedbs/censushome.nsf/home/factsheetsuid?opendocument&navpos=450

use OrigData/Census2011postcode/Income_postcode.dta, clear


	rename Negative_income Inc_1
	rename Nil_income Inc_2
	rename  _1__199___1__10_399_ Inc_3
	rename _200__299___10_400__15_599_ Inc_4
	rename _300__399___15_600__20_799_ Inc_5
	rename _400__599___20_800__31_199_ Inc_6
	rename _600__799___31_200__41_599_ Inc_7
	rename _800__999___41_600__51_999_ Inc_8
	rename _1_000__1_249___52_000__64_999_ Inc_9
	rename _1_250__1_499___65_000__77_999_ Inc_10
	rename _1_500__1_999___78_000__103_999_ Inc_11
	rename _2_000_or_more___104_000_or_more Inc_12

	
	
	gen val_inc1 = Inc_1 *(0)
	gen val_inc2 = Inc_2 *0 
	gen val_inc3 = Inc_3 *100
	gen val_inc4 = Inc_4 *250
	gen val_inc5 = Inc_5 *350
	gen val_inc6 = Inc_6 *500
	gen val_inc7 = Inc_7 *700
	gen val_inc8 = Inc_8 *900
	gen val_inc9 = Inc_9 *1125
	gen val_inc10 = Inc_10 *1375
	gen val_inc11 = Inc_11 *1750
	gen val_inc12 = Inc_12 *2500
	
	
	egen N_inc = rowtotal(Inc_*)

	egen Total_val_inc = rowtotal(val_*)

	* find the bucket of the median 
	gen N_median = N_inc/2
	gen N_75perc = 0.75*N_inc
	
	gen median_income = 0
	gen perc75_income = 0
	egen row_sum1 = rowtotal(Inc_1 Inc_2)
	
	
	forval x = 1(1)10{
		local y = `x' + 1 
		local z = `x' + 2
		egen row_sum`y' = rowtotal(row_sum`x' Inc_`z')
		replace median_income = (val_inc`z')/Inc_`z' if N_median > row_sum`x' & N_median <= row_sum`y'
		replace perc75_income = (val_inc`z')/Inc_`z' if N_75perc > row_sum`x' & N_75perc <= row_sum`y'
		}
		
	
	gen av_income = Total_val_inc/N_inc

	label variable av_income "Mean weekly income (postcode)"
	label variable median_income "Median weekly income (postcode)"
	label variable perc75_income "75th percentile weekly income (postcode)"
	
	keep postcode av_income median_income perc75_income

save OrigData/Census2011postcode/AverageIncome_postcode.dta, replace

* b. household income 


use OrigData/Census2011postcode/HH_income_postcode.dta, clear

	rename Negative_Income Income_neg
	rename Nil_Income Income_0
	rename _1__199 Income_1
	rename _200__ Income_2
	rename _300__ Income_3
	rename _400__ Income_4
	rename _600__ Income_5
	rename _800__ Income_6
	rename _1_000 Income_7
	rename _1_250_ Income_8
	rename _1_500 Income_9
	rename _2_000 Income_10
	rename _2_500 Income_11
	rename _3_000 Income_12
	rename _3_500 Income_13
	rename _4_000 Income_14
	rename _5_000_ Income_15

	capture drop Total

	* not creating variable for negative income or income >5000
	gen val_inc0 = 0
	gen val_inc1 = 100
	gen val_inc2 = 250
	gen val_inc3 = 350
	gen val_inc4 = 500
	gen val_inc5 = 700
	gen val_inc6 = 900
	gen val_inc7 = 1125
	gen val_inc8 = 1375
	gen val_inc9 = 1750
	gen val_inc10 = 2250
	gen val_inc11 = 2750
	gen val_inc12 = 3250
	gen val_inc13 = 3750
	gen val_inc14 = 4500

	egen N_inc = rowtotal(Income_*)

	gen N_median = N_inc/2
	gen N_75perc = 0.75*N_inc
		
	gen median_HHincome = 0
	gen perc75_HHincome = 0
	egen row_sum1 = rowtotal(Income_neg Income_0)
		
	forval x = 1(1)14{
			local y = `x' + 1 
			local z = `x' + 2
			egen row_sum`y' = rowtotal(row_sum`x' Income_`x')
			replace median_HHincome = (val_inc`x') if N_median > row_sum`x' & N_median <= row_sum`y'
			replace perc75_HHincome = (val_inc`x') if N_75perc > row_sum`x' & N_75perc <= row_sum`y'
		}
			
	keep median_HHincome perc75_HHincome postcode
	
			
save OrigData/Census2011postcode/HHIncome_postcode.dta, replace



*3. construct average age variable (note that this ignores right censoring at 100 years)

use OrigData/Census2011postcode/Age_postcode.dta, clear
	capture drop Total
	gen av_age=.
	egen Total =rowtotal(_*)

	mata:  
		yearsall=(0..100)'
		ages=st_data(.,2..102)
		Total = st_data(.,104)
		Calc_av=((ages*yearsall):/Total)
		st_store(.,("av_age"),Calc_av)
	end
	
	rename _0_years year0
	rename _1_year year1
	forval x = 2(1)100 {
		rename _`x'_years year`x'
		}
	
	
	* find the bucket of the median 
	gen N_median = Total/2
	gen N_75perc = 0.75*Total
	
	gen median_age = 0
	gen perc75_age = 0
	egen row_sum1 = rowtotal(year0 year1)
	
	
	forval x = 1(1)99{
		local y = `x' + 1 
		local z = `x' + 2
		egen row_sum`y' = rowtotal(row_sum`x' year`y')
		replace median_age = `y' if N_median > row_sum`x' & N_median < row_sum`y'
		replace perc75_age = `y' if N_75perc > row_sum`x' & N_75perc < row_sum`y'
		}
 	
	keep postcode av_age median_ perc75_

save OrigData/Census2011postcode/AverageAge_postcode.dta, replace

* 4. construct proportion of homes with no children
 
use OrigData/Census2011postcode/Children_postcode.dta, clear
 	capture drop Total

	egen Total=rowtotal(*_children One_child)
	gen PNoChildren = No_children/Total
	gen PChildren = 1- PNoChildren
	label variable PChildren "Proportion of households with children"
	 
	keep postcode PChildren 
	
	
 
 
save OrigData/Census2011postcode/PChildren_postcode.dta, replace

 * 5. construct proportion of houses that are separate

use OrigData/Census2011postcode/DwellingStructure_postcode.dta, clear

	gen TotalHouse = Total-Not_stated-Not_applicable
	gen TotalHouse_allresponse = Total - Not_applicable
	gen PSeparateHouse = Separate_house/(TotalHouse)
	gen PSemiDetached = (Semi_detached__one + Semi_detached__two_or_more)/(TotalHouse)
	gen Separate = PSeparate *TotalHouse
	
	keep postcode PSeparateHouse Separate PSemiDetached TotalHouse*
 
save OrigData/Census2011postcode/PSeparate_postcode.dta, replace
 
	clear
import excel using OrigData/Census2011Postcode/Dwellingstructure_Crossing.xlsx, cellrange(B9:O25) firstrow
	drop if _n==1

	drop if strpos(B,"crosses")>0 & strpos(B,"Vic") ==0
	drop if strpos(B,"Cells")>0
	drop if strpos(B,"Total")>0
	rename B postcode
	destring postcode, ignore("," "Vic" "./NSW" "crosses") replace
	capture drop Total
	rename Separatehouse Separate
	rename Semidetachedroworterraceho Semi_detached__one
	rename E Semi_detached__two_or_more
	
	egen Total = rowtotal(Separate - Notapplicable) 
	gen TotalHouse = Total-Notstated-Notapplicable
	gen TotalHouse_allresponse = Total - Notapplicable
	gen PSeparateHouse = Separate/(TotalHouse)
	gen PSemiDetached = (Semi_detached__one + Semi_detached__two_or_more)/(TotalHouse)
	
	keep postcode PSeparateHouse PSemiDetached TotalHouse* Separate
 
	append using OrigData/Census2011postcode/PSeparate_postcode.dta
	drop if postcode==.
	save OrigData/Census2011postcode/PSeparate_postcode.dta, replace
 
 
 
 
 
 
 
 * 6. construct proportion working full time
 
use OrigData/Census2011postcode/Employment_postcode.dta, clear

	gen PFullTime = Employed__worked_full_time/(Total-Not_stated )
 
	keep postcode PFullTime
 
save OrigData/Census2011postcode/PFullTime_postcode.dta, replace
 
 * 7. construct proportion non family households
  
use OrigData/Census2011postcode/HouseholdType_postcode.dta, clear

	gen PNonFamily = Non_family_household/(Total-Not_applicable)
 
	keep postcode PNonFamily
 
save OrigData/Census2011postcode/PNonFamily_postcode.dta, replace
 
 * 8. proportion professionals 
 
use  OrigData/Census2011postcode/Occupation_postcode.dta, clear
	capture drop Total

	egen Total = rowtotal(Managers Professionals Technicians Community Clerical Sales Machinery Labourers)
	gen PProf = Professionals/Total
 
	keep postcode PProf
 
save OrigData/Census2011postcode/PProf_postcode.dta, replace
 
 * 9. number of students
use  OrigData/Census2011postcode/Student_postcode.dta, clear
	gen NStudents=Total

	keep postcode NStudents

save OrigData/Census2011postcode/NStudents_postcode.dta, replace

 * 10. tenure
use  OrigData/Census2011postcode/Tenure_postcode.dta, clear
	capture drop Total
	egen Total = rowtotal( Owned_outright Owned_with_a_mortgage Being_purchased_under_a_rent_buy Rented Being_occupied_rent_free Being_occupied_under_a_life_tenu Other_tenure_type)

	gen POwnedOutright= Owned_outright/Total
	gen PRented = Rented/Total
	  
	keep POwnedOutright PRented postcode
  
save OrigData/Census2011postcode/POwnership_postcode.dta, replace
 
 * 11 vehicles
 
use  OrigData/Census2011postcode/Vehicles_postcode.dta, clear
	capture drop Total
	egen Total = rowtotal(  No_motor_vehicles One_motor_vehicle Two_motor_vehicles Three_motor_vehicles  Four_or_more_motor_vehicles)

	gen PTwoVehicles = Two_motor_vehicles/Total
	gen PNoVehicles = No_motor/Total
	gen PThreeMoreVehicles = (Three_motor_vehicles+Four_or_more_motor_vehicles)/Total
	 
	keep P* postcode
 
save OrigData/Census2011postcode/PVehicles_postcode.dta, replace

 
 * 12 bedrooms
 
 use  OrigData/Census2011postcode/Bedrooms_postcode.dta, clear

	capture drop Total
	egen Total = rowtotal(  None One_bedroom Two_bedrooms Three_bedrooms Four_bedrooms Five_bedrooms Six_bedrooms)

	gen POne_bedroom = One_bedroom/Total 
	gen PTwo_bedrooms = Two_bedrooms/Total 
	gen PThree_bedrooms = Three_bedrooms/Total
	gen PFour_bedrooms = Four_bedrooms/Total
	gen PFive_bedrooms = Five_bedrooms/Total
	gen PSix_bedrooms = Six_bedrooms/Total
	gen PFiveormorebedrooms = PFive_bedrooms+ PSix_bedrooms
 
	rename None room_0
	rename One room_1
	rename Two room_2
	rename Three room_3
	rename Four room_4
	rename Five room_5
	rename Six room_6
	
	gen val_room1 = room_1 *1
	gen val_room2 = room_2 *2 
	gen val_room3 = room_3 *3
	gen val_room4 = room_4 *4
	gen val_room5 = room_5 *5
	gen val_room6 = room_6 *6

	egen N_room = rowtotal(room_0-room_6)

	egen Total_val_room = rowtotal(val_*)
	gen av_rooms = Total_val_room/N_room

	* find the median 
	gen N_median = N_room/2
	gen N_75perc = 0.75*N_room
	
	gen median_rooms = 0
	gen perc75_rooms = 0
	
	egen row_sum1 = rowtotal(room_0 room_1)
	
	
	forval x = 1(1)5{
		local y = `x' + 1 
		egen row_sum`y' = rowtotal(row_sum`x' room_`y')
		replace median_rooms = (val_room`y')/room_`y' if N_median > row_sum`x' & N_median <= row_sum`y'
		replace perc75_rooms = (val_room`y')/room_`y' if N_75perc > row_sum`x' & N_75perc <= row_sum`y'
		}
		
	label variable av_rooms "Mean bedrooms"
	label variable median_rooms "Median bedrooms"
	label variable perc75_rooms "75th percentile bedrooms"
 
 
	keep P* postcode median_rooms av_rooms perc75_rooms
 
 save OrigData/Census2011postcode/PBedrooms_postcode.dta, replace

 * 13. Mortgage repayments
 
use OrigData/Census2011postcode/Mortgage_postcode.dta, clear
 
	gen N_mortgage = Total-Not_applicable - Not_stated

	rename Nil pay0
	rename _1__ pay1
	rename _150__ pay2
	rename _300_ pay3
	rename _450_ pay4
	rename _600_ pay5
	rename _800_ pay6
	rename _1_000_ pay7
	rename _1_200_ pay8
	rename _1_400_ pay9
	rename _1_600_ pay10
	rename _1_800_ pay11
	rename _2_000_ pay12
	rename _2_200_ pay13
	rename _2_400_ pay14
	rename _2_600_ pay15
	rename _3_000_ pay16
	rename _4_000_ pay17
	rename _5000_ pay18
	
	gen repayment0 = 0 
	gen repayment1 =75
	gen repayment2 =225
	gen repayment3 =375
	gen repayment4 =425
	gen repayment5 =700
	gen repayment6 =900
	gen repayment7 =1100
	gen repayment8 =1300
	gen repayment9 =1500
	gen repayment10 =1700
	gen repayment11 =1900
	gen repayment12 =2100
	gen repayment13 =2300
	gen repayment14 =2500
	gen repayment15 =2800
	gen repayment16 =3500
	gen repayment17 =4500
	gen repayment18 =5000
	
	
* find the bucket of the median 
	gen N_median = N_mortgage/2
	gen N_75perc = 0.75* N_mortgage
	
	gen median_mortgage = 0
	gen perc75_mortgage = 0
	
	egen row_sum1 = rowtotal(pay0 pay1)

	
	forval x = 1(1)17 {
		local y = `x' + 1 
		egen row_sum`y' = rowtotal(row_sum`x' pay`y')
		replace median_mortgage = repayment`y' if N_median > row_sum`x' & N_median <= row_sum`y'
		replace perc75_mortgage = repayment`y' if N_75perc > row_sum`x' & N_75perc <= row_sum`y'
		}
 	
	
	
	keep postcode median_ perc75_

save OrigData/Census2011postcode/MedianMortgage_postcode.dta, replace

 
 
 * 14. Rent per week 
 
use OrigData/Census2011postcode/Rent_postcode.dta, clear

	gen N_rent = Total-Not_applicable - Not_stated

	
	rename Nil pay0
	rename _1__ pay1
	rename _75__ pay2
	rename _100_ pay3
	rename _125_ pay4
	rename _150_ pay5
	rename _175_ pay6
	rename _200_ pay7
	rename _225_ pay8
	rename _250_ pay9
	rename _275_ pay10
	rename _300_ pay11
	rename _325_ pay12
	rename _350_ pay13
	rename _375_ pay14
	rename _400_ pay15
	rename _425_ pay16
	rename _450_ pay17
	rename _550_ pay18
	rename _650_ pay19
	
	gen rent0 = 0 
	gen rent2 = 37
	forval x = 75(25)425 {
		local y = `x'/25 
		gen rent`y' = `x'+12.5

		}
	
	gen rent18 =600
	gen rent19 =750
	
	gen N_median = N_rent/2
	gen N_75perc = 0.75* N_rent
	
	gen median_rent = 0
	gen perc75_rent = 0
	
	egen row_sum1 = rowtotal(pay0 pay1)

	
	forval x = 1(1)18 {
		local y = `x' + 1 
		egen row_sum`y' = rowtotal(row_sum`x' pay`y')
		replace median_rent = rent`y' if N_median > row_sum`x' & N_median <= row_sum`y'
		replace perc75_rent = rent`y' if N_75perc > row_sum`x' & N_75perc <= row_sum`y'
		}
 	
	keep postcode median_ perc75_

save OrigData/Census2011postcode/MedianRent_postcode.dta, replace



 use OrigData/Census2011postcode/Education_postcode.dta, clear

 
 gen total_applicable = Total  - Level_of_education_not_stated - Level_of_education_inadequately 
 
 gen PBachelor =(Postgraduate_Degree + Graduate_Diploma + Bachelor)/total_applicable
 
 keep postcode PBachelor 
 
 save OrigData/Census2011postcode/PBachelor_postcode.dta, replace
 
 

 
 
 
 
 
 
**************************************************************

global files1 AverageIncome HHIncome AverageAge PChildren PSeparate PFullTime PNonFamily PProf NStudents POwnership PVehicles PBedrooms MedianMortgage MedianRent


*. merge census dta files

foreach xx of global files1 {
	
	merge 1:1 postcode using OrigData/Census2011postcode/`xx'_postcode.dta, nogenerate
	
	erase OrigData/Census2011postcode/`xx'_postcode.dta
	}


sort postcode
label data Census11_postcode
gen year =2011

* census is taken in august - quarter 3. 

gen contract_quarter = tq(2011q3)
format contract_quarter %tq


gen contract_start_month = tm(2011m8) 


save Data/Census2011postcode.dta, replace




